import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
# Load the fruits dataset; read_table defaults to tab-separated columns.
df=pd.read_table('fruit_data_with_colors.txt')
df.head()
| fruit_label | fruit_name | fruit_subtype | mass | width | height | color_score | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | apple | granny_smith | 192 | 8.4 | 7.3 | 0.55 |
| 1 | 1 | apple | granny_smith | 180 | 8.0 | 6.8 | 0.59 |
| 2 | 1 | apple | granny_smith | 176 | 7.4 | 7.2 | 0.60 |
| 3 | 2 | mandarin | mandarin | 86 | 6.2 | 4.7 | 0.80 |
| 4 | 2 | mandarin | mandarin | 84 | 6.0 | 4.6 | 0.79 |
# Four fruit classes are present.
df.fruit_name.unique()
array(['apple', 'mandarin', 'orange', 'lemon'], dtype=object)
# Subtypes are finer-grained; note some rows are labelled 'unknown'.
df.fruit_subtype.unique()
array(['granny_smith', 'mandarin', 'braeburn', 'golden_delicious',
'cripps_pink', 'spanish_jumbo', 'selected_seconds', 'turkey_navel',
'spanish_belsan', 'unknown'], dtype=object)
# 59 rows, 7 columns.
df.shape
(59, 7)
# Summary statistics of the numeric columns (mass dwarfs the others in scale).
df.describe()
| fruit_label | mass | width | height | color_score | |
|---|---|---|---|---|---|
| count | 59.000000 | 59.000000 | 59.000000 | 59.000000 | 59.000000 |
| mean | 2.542373 | 163.118644 | 7.105085 | 7.693220 | 0.762881 |
| std | 1.208048 | 55.018832 | 0.816938 | 1.361017 | 0.076857 |
| min | 1.000000 | 76.000000 | 5.800000 | 4.000000 | 0.550000 |
| 25% | 1.000000 | 140.000000 | 6.600000 | 7.200000 | 0.720000 |
| 50% | 3.000000 | 158.000000 | 7.200000 | 7.600000 | 0.750000 |
| 75% | 4.000000 | 177.000000 | 7.500000 | 8.200000 | 0.810000 |
| max | 4.000000 | 362.000000 | 9.600000 | 10.500000 | 0.930000 |
# Column-wise max of the describe() table (rows include 'count', hence the 59s).
df.describe().max()
fruit_label 59.0 mass 362.0 width 59.0 height 59.0 color_score 59.0 dtype: float64
# Dtypes and null counts: all 59 rows non-null, mixed int/float/object columns.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 59 entries, 0 to 58 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 fruit_label 59 non-null int64 1 fruit_name 59 non-null object 2 fruit_subtype 59 non-null object 3 mass 59 non-null int64 4 width 59 non-null float64 5 height 59 non-null float64 6 color_score 59 non-null float64 dtypes: float64(3), int64(2), object(2) memory usage: 3.4+ KB
# Trim 'width' outliers with the standard 1.5*IQR rule: keep values in
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].  The previous version kept only the
# inter-quartile range itself (Q1 < width < Q3), which throws away half
# of the perfectly normal observations instead of just the outliers.
q_low = df["width"].quantile(0.25)
q_hi = df["width"].quantile(0.75)
iqr = q_hi - q_low
df_filtered = df[(df["width"] >= q_low - 1.5 * iqr) & (df["width"] <= q_hi + 1.5 * iqr)]
plt.scatter(df_filtered['mass'], df_filtered['width'])
<matplotlib.collections.PathCollection at 0x7fd13d0df310>
# Line plots of each numeric feature against the row index, to compare
# their relative scales on one axis.
plt.plot(df['height'],label='height')
plt.plot(df['mass'],label='mass')
plt.plot(df['width'],label='width')
plt.plot(df['color_score'],label='color_score')
plt.legend()
<matplotlib.legend.Legend at 0x7fd13d204f40>
# Same plot without 'mass', whose scale flattens the other curves.
plt.plot(df['width'],label='width')
plt.plot(df['color_score'],label='color_score')
plt.plot(df['height'],label='height')
plt.legend()
<matplotlib.legend.Legend at 0x7fd13d2a2fd0>
# Interactive 3-D scatter with plotly: mass/width/height as axes,
# color_score as the colour scale, one marker symbol per fruit class.
import plotly.express as px
fig = px.scatter_3d(df, x='mass', y='width', z='height',
color='color_score',symbol='fruit_name',opacity=0.7)
fig.show()
# Map fruit_label -> fruit_name by zipping the paired columns themselves.
# Zipping the two .unique() arrays (as before) only pairs correctly if both
# columns happen to introduce new values in the same order; zipping the full
# columns pairs each label with the name from its own row, which is robust.
lookup_fruits_name = dict(zip(df.fruit_label, df.fruit_name))
lookup_fruits_name
{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}
# Sanity checks: no missing values and no duplicated rows.
df.isna().sum()
fruit_label 0 fruit_name 0 fruit_subtype 0 mass 0 width 0 height 0 color_score 0 dtype: int64
df.duplicated().sum()
0
# Cleanup steps kept for reference; not needed since the checks above pass.
#df.drop_duplicates()
#df.dropna()
## Position of the Outlier
#print(np.where(df['mass']>10))
# Analysing the correlation between the numeric features.
# Mass and width are strongly correlated (0.88); fruit_label correlates
# most with height (0.51) in this output.
# NOTE(review): df still contains object columns (fruit_name, fruit_subtype)
# here — pandas >= 2.0 requires df.corr(numeric_only=True) for this; confirm
# the pandas version in use.
df.corr()
| fruit_label | mass | width | height | color_score | |
|---|---|---|---|---|---|
| fruit_label | 1.000000 | 0.032738 | -0.298090 | 0.508766 | -0.310521 |
| mass | 0.032738 | 1.000000 | 0.877687 | 0.609571 | -0.079794 |
| width | -0.298090 | 0.877687 | 1.000000 | 0.396848 | -0.076576 |
| height | 0.508766 | 0.609571 | 0.396848 | 1.000000 | -0.247047 |
| color_score | -0.310521 | -0.079794 | -0.076576 | -0.247047 | 1.000000 |
# The last rows are all lemons with subtype 'unknown'.
df.tail()
| fruit_label | fruit_name | fruit_subtype | mass | width | height | color_score | |
|---|---|---|---|---|---|---|---|
| 54 | 4 | lemon | unknown | 116 | 6.1 | 8.5 | 0.71 |
| 55 | 4 | lemon | unknown | 116 | 6.3 | 7.7 | 0.72 |
| 56 | 4 | lemon | unknown | 116 | 5.9 | 8.1 | 0.73 |
| 57 | 4 | lemon | unknown | 152 | 6.5 | 8.5 | 0.72 |
| 58 | 4 | lemon | unknown | 118 | 6.1 | 8.1 | 0.70 |
# Boolean mask of rows whose subtype is 'unknown' (rows 49-58).
df["fruit_subtype"]=="unknown"
0 False 1 False 2 False 3 False 4 False 5 False 6 False 7 False 8 False 9 False 10 False 11 False 12 False 13 False 14 False 15 False 16 False 17 False 18 False 19 False 20 False 21 False 22 False 23 False 24 False 25 False 26 False 27 False 28 False 29 False 30 False 31 False 32 False 33 False 34 False 35 False 36 False 37 False 38 False 39 False 40 False 41 False 42 False 43 False 44 False 45 False 46 False 47 False 48 False 49 True 50 True 51 True 52 True 53 True 54 True 55 True 56 True 57 True 58 True Name: fruit_subtype, dtype: bool
(df["fruit_subtype"]=="unknown").sum()
10
# fruit_subtype is not used as a feature or target (and has 10 'unknown'
# entries), so drop it in place.
df.drop('fruit_subtype',axis=1,inplace=True)
df.head()
df.head()
| fruit_label | fruit_name | mass | width | height | color_score | |
|---|---|---|---|---|---|---|
| 0 | 1 | apple | 192 | 8.4 | 7.3 | 0.55 |
| 1 | 1 | apple | 180 | 8.0 | 6.8 | 0.59 |
| 2 | 1 | apple | 176 | 7.4 | 7.2 | 0.60 |
| 3 | 2 | mandarin | 86 | 6.2 | 4.7 | 0.80 |
| 4 | 2 | mandarin | 84 | 6.0 | 4.6 | 0.79 |
# Feature matrix (the four numeric measurements) and target labels.
X = df[['mass', 'width', 'height', 'color_score']]
y = df['fruit_label']
df.shape
(59, 6)
# Manual 75/25 head/tail split of the rows.
# NOTE(review): the rows are ordered by fruit class (apples first, lemons
# last), so this sequential split leaves some classes entirely out of the
# training set; it is superseded by the shuffled train_test_split further
# down and kept only for illustration.
ratio = 0.75
total_rows = df.shape[0]
train_size = int(total_rows*ratio)
# Split the features into train and test partitions
X_train = X[0:train_size]
X_test = X[train_size:]
X_train.shape,X_test.shape
((44, 4), (15, 4))
# Split the labels with the same boundary
y_train = y[0:train_size]
y_test = y[train_size:]
# Rebuild the train/test partition with scikit-learn so the rows are
# shuffled before splitting (75% train / 25% test, seeded for
# reproducibility), then standardize the features.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df[['mass', 'width', 'height', 'color_score']]
y = df['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, random_state=0)

# Fit the scaler on the training data only, then apply that same
# transform to both partitions (avoids test-set leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train a k-nearest-neighbours classifier on the standardized features.
# n_neighbors is the k of k-NN; 5 is a common default choice.
from sklearn.neighbors import KNeighborsClassifier
knmodel = KNeighborsClassifier(n_neighbors= 5)
knmodel.fit(X_train,y_train)
KNeighborsClassifier()
# Predict labels for the held-out test set.
y_pred = knmodel.predict(X_test)
print(knmodel.predict(X_test))
[3 3 4 3 1 1 3 4 3 1 2 1 3 3 1]
# First prediction as a numeric label...
print(y_pred[0])
3
# ...and translated back to a fruit name via the lookup dict.
lookup_fruits_name[y_pred[0]]
'orange'
# Optional visual check: scatter predicted vs. true labels to see where
# they agree (points on the diagonal) or disagree.
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(y_pred, y_test)
# Quantify performance precisely with sklearn's accuracy metric
# (fraction of correct predictions, shown here as a percentage).
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)*100
93.33333333333333